#include <sys/stat.h>
#include <dirent.h>
#include <errno.h>
#include <ctype.h>
#include <glib.h>
#include <math.h>

#include "bss.h"
#include "../seq/tree.h"

#define FSCANLINE(x) fscanf(readfile,"%1023[^\n]\n",x)


extern char dbDir[];
extern int clones_matched;
extern int clones_not_matched;
extern int use_blast_flag;
extern int use_megablast_flag;
extern int use_blat_flag;
extern long double eVal;
extern int blatscore;
extern char xtraParm[];
extern int defCloneName(char *stcname, char *clonename);


int match_bes_to_clone(char* query, int* cidx, struct bss_instance* pbsi);
extern void safe_strncpy(char* dest, char* src, size_t num);


/* DONT GIVE TWO OF THESE THE SAME NAME */
char bss_headers[BSS_NUMTYPES][BSS_FIELD_MAXSIZE+1] = {
                        "Target",
                        "SeqCtg",
                        "CtgNum",
                        "CtgEnd",
                        "Contig",
                        "Query_len",
                        "Query_start",
                        "Query_end",  
                        "Targ_len",
                        "Targ_start",
                        "Targ_end",
                        "Hit_len",
                        "Query",
                        "RC",
                        "Clone",
                        "Score",
                        "EValue",
                        "Identity",
                        "Match",
                        "Intron",
                        "SeqClone",
                        "Query",
                        "Target"
                        };



enum  bss_datatype bss_column_order[BSS_NUMTYPES];

char bss_col_widths[BSS_NUMTYPES] = {
    20, // BES   
    8,  // SEQCTG
    8,  // CTG
    8,  // LR
    8,  // CTGLR
    12, // QLEN 
    12, // QSTART 
    12, // QEND
    12, // TLEN
    12, // TSTART 
    12, // TEND
    12, // HITLEN
    20, // MARKER
    5,  // RC
    20, // CLONE
    8,  // SCORE
    8,  // EVALUE
     10,  // IDENTITY,
     10,  // MATCH,
     10,  // INTRON,
     20,  // SEQ,
     50,  // QUERY,which may be a long genbank header
     50  // TARGET,  likewise
};

char bss_col_widths_px[BSS_NUMTYPES] = {
    100, // BES   
    50,  // SEQCTG
    50,  // CTG
    50,  // LR
    50,  // CTGLR
    80, // QLEN 
    80, // QSTART 
    80, // QEND
    80, // TLEN
    80, // TSTART 
    80, // TEND
    80, // HITLEN
    100, // MARKER
    30,  // RC
    100, // CLONE
    50,  // SCORE
    50,  // EVALUE
     50,  // IDENTITY,
     50,  // MATCH,
     50,  // INTRON,
     100,  // SEQ,
     100,  // QUERY,
     100  // TARGET,
};


/* these columns can't be unchecked because we need them */
char bss_mandatory_cols[BSS_NUMTYPES] = {
    1, // BES   
    1,  // SEQCTG
    1,  // CTG
    0,  // LR
    1,  // CTGLR
    1, // QLEN 
    0, // QSTART 
    0, // QEND
    0, // TLEN
    0, // TSTART 
    0, // TEND
    0, // HITLEN
    1, // MARKER
    0,  // RC
    1, // CLONE
    0,  // SCORE
    0,  // EVALUE
     0,  // IDENTITY,
     0,  // MATCH,
     0,  // INTRON,
     1,  // SEQ,
     1,  // QUERY,
     1  // TARGET,
};



void create_ctgtotals_array(struct ctgTotalsParm **scores, int num_ctgs){
    int i;

    *scores=malloc(sizeof(struct ctgTotalsParm)*(num_ctgs));
    NOMEM2(*scores,"scores");
    /*Init array*/
    for(i=0;i<num_ctgs;i++){
        (*scores)[i].cst=0;
        (*scores)[i].cloneHits=0;
        (*scores)[i].contig=0;
    }
}

struct ctgsHitlistType* find_ctg_hit(int ctg, struct ctgsHitlistType *list){

    struct ctgsHitlistType *ptr;

    ptr = list;
    while(ptr != NULL){
        if(ptr->contig == ctg) return ptr;
        ptr = ptr->next;
    }
    return 0;
}

void add_to_ctgsHitlist(int ctg, struct ctgsHitlistType **plist){

    /*struct ctgsHitlistType *ptr;*/
    struct ctgsHitlistType *ptrnew;

    ptrnew=malloc(sizeof(struct ctgsHitlistType));
    NOMEM2(ptrnew,"ctgsHitlistType");
    ptrnew->contig=ctg;
    ptrnew->cst=0;
    ptrnew->cloneHits=0;
    ptrnew->next=*plist;
    *plist = ptrnew;

}

int check_in_ctg2(int c, int* ctg, char** clonename, char* LR)
{
    CLONE *clp;
    int result = 0;

    clp = arrp(acedata,c,CLONE);
    if (1) // CTG0  clp->ctg > 0)
    {
        *ctg = clp->ctg;
        *clonename = clp->clone;
        *LR = '-';
        if (*ctg > 0)
        {
            if (clp->x - Pz.fromendInt <= contigs[*ctg].left)
            {
                *LR = 'L';
            }
            else if (clp->y + Pz.fromendInt >= contigs[*ctg].right)
            {
                *LR = 'R';
            }
        }
        result = 1;
    }
    
    return result;
}

int check_in_ctg(char* name, int* ctg, char** clonename, char* LR, FPC_TREE* pdbtree)
{
    int result = 0;
    SEQ_DATA* pdata;

    pdata = fpc_tree_find_data(pdbtree,name);   
    if (pdata)
    {
        if (pdata->cidx >= 0)
        {
            return check_in_ctg2(pdata->cidx,ctg,clonename,LR);
        }        
    }
    
    return result;
}

void delete_query_hit_data(void** ppdata)
{

    assert(ppdata);
    assert (*ppdata);

    /* note we don't free the ctg hit list as it is transferred to 
        the array structure */

    free(*ppdata);
    *ppdata = 0;
}

void destroy_bss_instance(struct bss_instance* pbsi)
{
    if (pbsi->bss_hits) arrayDestroy(pbsi->bss_hits);
    if (pbsi->bss_ctghits) arrayDestroy(pbsi->bss_ctghits);
    if (pbsi->bss_queryhits) arrayDestroy(pbsi->bss_queryhits);
    if (pbsi->dbs) g_array_free(pbsi->dbs,TRUE);
    memset(pbsi,0,sizeof(struct bss_instance));
}

int parse_seqctg(char* in, char* seqname, int* pctg)
{
    char* ptr;
    char c;
    char* test = strdup(in);

    for (ptr = test; *ptr != 0; ptr++)
    {
        *ptr = tolower(*ptr);
    }

    assert(in);assert(seqname);assert(pctg);
    if (2 == sscanf(in,"%[^.].%d",seqname,pctg))
    {
        return 1;
    } 

    if ( (ptr = strstr(test,"contig")))
    {
        if (ptr > test)
        {
            ptr--;
            c = *ptr;
            *ptr = 0;
            strcpy(seqname,test);
            *ptr = c;
            ptr++;
        }
        ptr += strlen("contig");
        if (1 == sscanf(ptr,"%d",pctg))
        {
            return 1;
        }  
    }     
    *pctg = 1;
    return 0;
}

void parse_blast_output(struct bss_instance* pbsi, char* blastfile, FPC_TREE* pqtree, FPC_TREE* pdbtree)
{
    char* ptr;
    char oneline[1024];
    char query_id[1024];
    char subject_id[1024];
    double pct_identity;
    int alignment_length;
    int mismatches;
    int gap_openings;
    int qstart;
    int qend;
    int tstart;
    int tend;
    char evalue[100];
    int bit_score;
    BSS_HIT* hit;
    SEQ_DATA* psdataq;
    SEQ_DATA* psdatat;
    int qlen;
    int tlen;
    FILE* infile;
    int n_hits = 0;
    int hit_ctg;
    char* clonename;
    int itemp;
    char LR;
    int pct_match;

    infile = fopen(blastfile,"r");
    if (!infile)
    {
        messout("Failed to open blast output file %s\n",blastfile);
        return;
    }    

    printf("parsing %s\n", blastfile);fflush(0);   
    pbsi->bss_hits = arrayCreate(0, BSS_HIT) ;

    n_hits = 0;
    while (fgets(oneline, sizeof(oneline), infile) != NULL) 
    {
        if (12 != sscanf(oneline,
                         "%s %s %lf %d %d %d %d %d %d %d %s %d ",
                         query_id, subject_id, &pct_identity,
                         &alignment_length, &mismatches, &gap_openings,
                         &qstart, &qend, &tstart,
                         &tend, evalue, &bit_score))
        {
            continue;
        }
        pbsi->total_hits++;


        // if megablast has appended "lcl|", get rid of it
        if (pbsi->program == MEGABLAST && 0 == strncmp(subject_id,"lcl|",4))
        {
            ptr = subject_id + 4;
            memmove(subject_id, ptr, strlen(ptr) + 1);
        }
        
        // If we're dealing with a genbank header, strip out the last field.
        // (For megablast, it already parses out one of the fields...see "collect_fasta_seq_data"
        // in dosearch.c)
        
        if (0 == strncmp(query_id,"gi|",3))
        {
            ptr = strrchr(query_id, '|');
            ptr++;
            memmove(query_id, ptr, strlen(ptr) + 1);
        }
        if (0 == strncmp(subject_id,"gi|",3))
        {
            ptr = strrchr(subject_id, '|');
            ptr++;
            memmove(subject_id, ptr, strlen(ptr) + 1);
        }

        psdataq = fpc_tree_find_data(pqtree,query_id);
        if (!psdataq)
        {
            fprintf(stderr,"Found hit for unknown query sequence %s\n",query_id);fflush(0);
            continue;
        }
        qlen = psdataq->len;

        psdatat = fpc_tree_find_data(pdbtree,subject_id);
        if (!psdatat)
        {
            fprintf(stderr,"Found hit for unknown database sequence %s\n",subject_id);fflush(0);
            continue;
        }
        tlen = psdatat->len;

        if (!check_in_ctg(subject_id, &hit_ctg, &clonename, &LR, pdbtree)) continue;
        
        if (Zbatch_flag && pbsi->batch_search_type == CURCTG && hit_ctg != pbsi->batch_ctg)
        {
            continue;
        }    
        if (Zbatch_flag && pbsi->batch_search_type == ENDS && LR == '-')
        {
            continue;
        }    
          
        hit = arrayp(pbsi->bss_hits,n_hits,BSS_HIT);
        memset(hit,0,sizeof(hit));
        hit->order = n_hits;
        n_hits++;

        hit->data[BSS_QUERY].str  = strdup(psdataq->name_org);
        hit->data[BSS_TARGET].str  = strdup(psdatat->name_org);

        hit->data[BSS_CTG].i      = hit_ctg;
        hit->data[BSS_LR].str = g_strdup_printf("%c",LR);
        if (LR != '-')
        {
            hit->data[BSS_CTGLR].str = g_strdup_printf("%d%c",hit_ctg,LR);
        }
        else
        {
            hit->data[BSS_CTGLR].str = g_strdup_printf("%d",hit_ctg);
        }        

        hit->data[BSS_CLONE].str  = strdup(clonename);
        hit->data[BSS_IDENTITY].i = rint(pct_identity) + 0.1;
        hit->data[BSS_SCORE].i    = bit_score;
        hit->data[BSS_EVALUE].str = strdup(evalue);

        if (tend < tstart)
        {                
            hit->data[BSS_RC].str = strdup("y");
            itemp = tend; tend = tstart; tstart = itemp;
        }
        else
        {
            hit->data[BSS_RC].str = strdup("n");
        }
        hit->data[BSS_TSTART].i   = tstart; 
        hit->data[BSS_TEND].i     = tend; 
        hit->data[BSS_QSTART].i   = qstart; 
        hit->data[BSS_QEND].i     = qend; 
        hit->data[BSS_QLEN].i     = qlen; 
        hit->data[BSS_TLEN].i     = tlen; 

        pct_match = (qlen < tlen ? (100*(qend-qstart+1))/qlen : (100*(tend-tstart+1))/tlen);
        hit->data[BSS_MATCH].i    = pct_match;

        hit->data[BSS_SEQCTG].i    = psdataq->seqctg;


        hit->data[BSS_BES].str = strdup(subject_id); 
        hit->data[BSS_MARKER].str = strdup(query_id);

        pbsi->retained_hits++;
    }

    fclose(infile);
    remove(blastfile);

    set_column_order(pbsi);   
    write_bss_file(pbsi,1);
}
void getStringField(char* line,int field,char* dest,int max) {
    char* ptr1 = line;
    char* ptr2;
    int fieldnum = 0;
    int infield = 0;
    int c;

    /* zero the output and get to the start of first field */
    *dest = 0;
    while (isspace((int)*ptr1) && *ptr1 != 0) {
        ptr1++;
    }
    if (*ptr1 == 0) {
        return;
    }
    ptr2 = ptr1;
    infield = 1;
    /* go through characters one by one */
    while (*ptr1 != 0) {
        if (!infield && (!isspace((int)*ptr1) && *ptr1 != ',')) {
            infield = 1;
            fieldnum++;
            ptr2 = ptr1;
        }
        else if (infield && (isspace((int)*ptr1) || *ptr1 == ',')) {
            infield = 0;
            if (fieldnum == field) {
                c = *ptr1;
                *ptr1 = 0;
                strncpy(dest,ptr2,max);
                *ptr1 = c;
                break;
            }
        }
        ptr1++;
    }
    return;
}

void getIntField(char* line,int field,int* dest) {
    char tmp[255];
    *dest = 0;
    getStringField(line,field,tmp,255);
    *dest = atoi(tmp);
    return;
}
/* find the largest intron */
void getMaxTGap(char* line,int* dest) {
    int blocks;
    int block;
    int bsize = 0;
    int tstart = 0;
    int tend = -1;
    int gap = 0;

    *dest = 0;
    getIntField(line,17,&blocks);
    if (blocks > 0){
        for (block = 0; block < blocks; block++){
            getIntField(line,18+block, &bsize);
            getIntField(line,18+block+2*blocks, &tstart);
            if (tend > -1){
                gap = tstart - tend;
                if (gap > *dest){
                    *dest = gap;
                }
            }
            tend = tstart + bsize; /* first base after the block */
        }
    }
}

void parse_blat_output(struct bss_instance* pbsi, char* blastfile, FPC_TREE* pqtree, FPC_TREE* pdbtree)
{
    char oneline[1024];
    char query_id[1024];
    char subject_id[1024];
    double pct_identity;
    int qstart;
    int qend;
    int tstart;
    int tend;
    BSS_HIT* hit;
    SEQ_DATA* psdataq, *psdatat;
    int qlen;
    int tlen;
    FILE* infile;
    int n_hits = 0;
    int hit_ctg;
    char* clonename;
    int pct_match;
    int match;
    int mismatch;
    int rmatch;
    int nN;
    int nqgap;
    int qgap_bp;
    int ntgap;
    int tgap_bp;
    char strand;
    char LR;
    char* ptr;

    infile = fopen(blastfile,"r");
    if (!infile)
    {
        messout("Failed to open blat output file %s\n",blastfile);
        return;
    }   

    printf("parsing %s\n", blastfile);fflush(0);   
    
    pbsi->bss_hits = arrayCreate(0, BSS_HIT) ;

    n_hits = 0;
    while (fgets(oneline, sizeof(oneline), infile) != NULL) 
    {
        if (sscanf(oneline,"%u %u %u %u %u %u %u %u %c %s %u %u %u %s %u %u %u",
                         &match, &mismatch, &rmatch, &nN, &nqgap, &qgap_bp, &ntgap, &tgap_bp,
                        &strand, query_id, &qlen, &qstart, &qend, 
                        subject_id, &tlen, &tstart, &tend) != 17) continue;
        pbsi->total_hits++;

        match += rmatch;

        // If we're dealing with a genbank header, strip out the last field 
        if ((ptr = strrchr(query_id, '|')))
        {
            ptr++;
            memmove(query_id, ptr, strlen(ptr) + 1);
        }
        if ((ptr = strrchr(subject_id, '|')))
        {
            ptr++;
            memmove(subject_id, ptr, strlen(ptr) + 1);
        }

        psdataq = fpc_tree_find_data(pqtree,query_id);
        if (!psdataq)
        {
            fprintf(stderr,"Found hit for unknown query sequence %s\n",query_id);fflush(0);
            continue;
        }

        psdatat = fpc_tree_find_data(pdbtree,subject_id);
        if (!psdatat)
        {
            fprintf(stderr,"Found hit for unknown database sequence %s\n",subject_id);fflush(0);
            continue;
        }

        if (!check_in_ctg(subject_id, &hit_ctg, &clonename, &LR, pdbtree)) continue;                

        if (Zbatch_flag && pbsi->batch_search_type == CURCTG && hit_ctg != pbsi->batch_ctg)
        {
            continue;
        }    
        if (Zbatch_flag && pbsi->batch_search_type == ENDS && LR == '-')
        {
            continue;
        }    

        hit = arrayp(pbsi->bss_hits,n_hits,BSS_HIT);
        memset(hit,0,sizeof(hit));
        hit->order = n_hits;
        n_hits++;

        hit->data[BSS_QUERY].str = strdup(psdataq->name_org);
        hit->data[BSS_TARGET].str = strdup(psdatat->name_org);

        hit->data[BSS_CTG].i      = hit_ctg;
        hit->data[BSS_LR].str = g_strdup_printf("%c",LR);
        if (LR != '-')
        {
            hit->data[BSS_CTGLR].str = g_strdup_printf("%d%c",hit_ctg,LR);
        }
        else
        {
            hit->data[BSS_CTGLR].str = g_strdup_printf("%d",hit_ctg);
        }    

        hit->data[BSS_CLONE].str  = strdup(clonename);

        pct_identity = (100*match)/(match + mismatch);
        pct_match = (qlen < tlen ? (100*match)/qlen : (100*match)/tlen);

        hit->data[BSS_IDENTITY].i = pct_identity;
        hit->data[BSS_SCORE].i    = match;
        hit->data[BSS_MATCH].i    = pct_match;
         
        hit->data[BSS_RC].str = (strand == '+' ? strdup("n") : strdup("y"));

        hit->data[BSS_TSTART].i   = tstart; 
        hit->data[BSS_TEND].i     = tend; 
        hit->data[BSS_QSTART].i   = qstart; 
        hit->data[BSS_QEND].i     = qend; 
        hit->data[BSS_QLEN].i     = qlen; 
        hit->data[BSS_TLEN].i     = tlen; 

        hit->data[BSS_SEQCTG].i    = psdataq->seqctg;

        getMaxTGap(oneline,&(hit->data[BSS_INTRON].i));


        hit->data[BSS_BES].str = strdup(subject_id); 
        hit->data[BSS_MARKER].str = strdup(query_id);

        pbsi->retained_hits++;
    }

    fclose(infile);
    remove(blastfile);

    set_column_order(pbsi);   
    write_bss_file(pbsi,1);
}





